Data Sources

The following links below contain publicly available data about A&E attendances across Scottish Hospitals published by Public Health Scotland. The data used was obtained as of the 26th May 2024

The monthly data on A&E attendances can be obtained from here: https://www.opendata.nhs.scot/dataset/monthly-accident-and-emergency-activity-and-waiting-times/resource/022c3b27-6a58-48dc-8038-8f1f93bb0e78

The weekly data on A&E waiting times can be obtained from here:

https://www.opendata.nhs.scot/dataset/weekly-accident-and-emergency-activity-and-waiting-times/resource/a5f7ca94-c810-41b5-a7c9-25c18d43e5a4

# Set the working directory
setwd("/Users/nng3/Masters/Semester 3/Project 1")
# Function to compute the MSE for demand for A&E
mse.load <- function(model, test_data){
    pred <- cbind(test_data, predict.bam(model,test_data,se=TRUE,type="response"))
    mse <- mean((pred$total_attendances-pred$fit)^2)
    return(mse)
}

# Function to compute the RMSE for hourly a&e attendances
rmse.load.hr <- function(model, test_data){
    pred <- predict.bam(model,test_data,se=TRUE,type="response")
    rmse <- sqrt(mean((test_data$attendances-pred$fit)^2))
    return(rmse)
}

# Function to compute the RMSE for waiting times
rmse.wait <- function(model, test_data){
    pred <- cbind(test_data, predict.bam(model,test_data,se=TRUE,type="response"))
    rmse <- sqrt(mean((test_data$prop.w4hrs-pred$fit)^2))
    return(rmse)
}

# Function to compute the baseline RMSE for waiting times
wait.baseline.rmse <- function(train_data, test_data) {
  # Compute the last year's (364 days ago) julian day for the test data
  train_data_baseline <- train_data %>%
    filter(julian %in% (unique(test_data$julian) - 364)) %>%
    select(loc, julian, prop.w4hrs) %>%
    rename(last_julian = julian)
  
  # Compute the last julian in the test data to join on
  test_data <- test_data %>%
    mutate(last_julian = julian - 364)
  
  # Merge baseline forecast with test data
  baseline_forecast <- merge(test_data, train_data_baseline, by = c("last_julian", "loc"))
  
  # Calculate RMSE of the baseline
  rmse <- sqrt(mean((baseline_forecast$prop.w4hrs.x - baseline_forecast$prop.w4hrs.y)^2, na.rm = TRUE))
  
  return(rmse)
}

# Function to calculate the baseline RMSE
load.baseline.rmse <- function(train_data, test_data) {
  # Filter the baseline data based on cmonth - baseline_period
  baseline_data <- train_data %>%
    filter(cmonth %in% (unique(test_data$cmonth) - 12))
  
  baseline_forecast <- merge(test_data, baseline_data, by = c("loc", "day.num", "hr.num","month"), suffixes = c(".test", ".baseline"))

  # Calculate RMSE of the baseline
  rmse <- sqrt(mean((baseline_forecast$attendances.test - baseline_forecast$attendances.baseline)^2, na.rm = TRUE))
  
  return(rmse)
}

# Function to sum every 168 observations in forecast data
sum_168 <- function(x) {
    tapply(x, (seq_along(x)-1) %/% 168, sum)
}

# Function to sum every 6th element starting from different initial points
sum_by_position <- function(vec) {
  result <- numeric(6)
  
  # Loop through the positions (1 to 6) and sum the corresponding elements
  for (i in 1:6) {
    indices <- seq(i, 180, by = 6)
    result[i] <- sum(vec[indices])
  }
  
  return(result)
}

Data Cleaning and Preprocessing

# Read data and drop the country variable (only one unique obs: "S92000003") and department type for the waiting times dataset
load <- read.csv("~/Masters/Semester 3/Project 1/weekly_load.csv")[,-2]
wait <- read.csv("~/Masters/Semester 3/Project 1/weekly_wait.csv")[,-2]

# Changing col names
names(load) <- c("monthyr", "HBT", "loc", "dpttype", "day", "week", "hr", "inout", "attendances")
names(wait) <- c("wed", "HBT", "loc", "dpttype","attendances", "w4hrs", "gr4hrs", "per.w4hrs", "gr8hrs", "per.gr8hrs", "gr12hrs", "per.gr12hrs")

# Changing observation names for readability 
load$dpttype[load$dpttype == "Emergency Department"] <- "AE" 
load$dpttype[load$dpttype == "Minor Injury Unit or Other"] <- "MIU" 
wait$dpttype[wait$dpttype == "Emergency Department"] <- "AE" 
wait$dpttype[wait$dpttype == "Minor Injury Unit or Other"] <- "MIU" 

# Filter to focus only the AE department not MIU
load <- load[load$dpttype=="AE",]
wait <- wait[wait$dpttype=="AE",]

# Convert the date variable
wait$date <- as.Date(as.character(wait$wed), format("%Y%m%d"))
# Adjust the date index for load 
# Change the hours 00:00-00:59 for example to start at 0 to 23
load$hr.num <- as.numeric(sub(":.*", "", load$hr))

# Change the day to 1,2,...,7 days of the week
day_to_num <- c("Friday"=5, "Monday"=1, "Saturday"=6, "Sunday"=7,   "Thursday"=4, "Tuesday"=2, "Wednesday"=3)
load$day.num <- day_to_num[load$day]

# Segregate month and year into separate columns
load$year <- as.numeric(substr(load$monthyr, 1, 4))
load$month <- as.numeric(substr(load$monthyr, 5, 6))
wait$year <- as.numeric(substr(wait$date, 1, 4))
wait$month <- as.numeric(substr(wait$date, 6, 7))
wait$dom <- as.numeric(substr(wait$date, 9, 10))
# Specify the number of days from the start day 
wait$julian <- julian(wait$date, origin=as.Date("2015-02-22"))

# Create factor variables
load$HBT <- as.factor(load$HBT)
wait$HBT <- as.factor(wait$HBT)
wait$loc <- as.factor(wait$loc)
load$inout <- as.factor(load$inout)
load$loc <- as.factor(load$loc)

# Create proportions
wait$per.gr4hrs <- round((wait$gr4hrs/wait$attendances)*100,1)
wait$prop.w4hrs <- wait$w4hrs/wait$attendances
wait$prop.gr4hrs <- wait$gr4hrs/wait$attendances
wait$bet4to8hrs <- wait$gr4hrs - wait$gr8hrs 
wait$prop.bet4to8hrs <- wait$bet4to8hrs/wait$attendances
wait$bet8to12hrs <- wait$gr8hrs - wait$gr12hrs
wait$prop.bet8to12hrs <- wait$bet8to12hrs/wait$attendances
wait$prop.gr12hrs <- wait$gr12hrs/wait$attendances

Impute the zeros

We noticed that observations with no A&E attendances were dropped, so we must impute them back into the dataset.

# Order the data by day 
dup <- load[order(load$loc, load$year, load$month, load$day.num), ]
# List wise segregation
split_dup <- split(dup, list(dup$loc, dup$year, dup$month, dup$day.num), drop = TRUE)

hr_vect <- seq(0,23)
# Return missing data
check <- lapply(split_dup, function(df) {
    if(length(df$hr.num) != 24) {
        return(df)
    }
}
)
# Drop the null 
check <- check[!sapply(check, is.null)]

# Create new rows in the dataframe containing 0 attendances
dup <- lapply(check, function(df) {
    loc <- which(!(hr_vect %in% df$hr.num))
    new_row <- df[rep(1,length(loc)),]
    new_row$hr.num <- c(loc-1)
    return(new_row)
})

# Combine the list back into a dataframe
sam <- do.call(rbind, dup)
sam$attendances <- 0
load <- rbind(load, sam)

# Cumulative months and years
load$cmonth <- (load$year-2018)*12 + load$month
load$chr <- (load$day.num-1)*24 + load$hr.num
rm(split_dup)
rm(check)
rm(dup)

Remove duplicates and incomplete data

# Check which hospitals have more or less than 482 observations
locw <- unique(wait$loc)
num <- rep(0, length(locw))
for(i in 1:length(locw)) {
    num[i] <- nrow(wait[wait$loc == locw[i],])
}
locw[which(num != 482)]
## [1] G405H G306H G516H
## 32 Levels: A111H A210H B120H C121H C313H C418H F704H G107H G306H ... Z102H
# Chose to drop "G306H" "G516H" and inspect "G405H"
id <- which(wait$loc == "G405H" & wait$date == "2015-05-03")
# Sum together attendances and remove duplicates
wait$attendances[id[2]] <- wait$attendances[333] + wait$attendances[336] 
wait$w4hrs[id[2]] <- wait$w4hrs[id[1]] + wait$w4hrs[id[2]]
wait$gr4hrs[id[2]] <- wait$gr4hrs[id[1]] + wait$gr4hrs[id[2]]
wait$gr8hrs[id[2]] <- wait$gr8hrs[id[1]] + wait$gr8hrs[id[2]]
wait$gr12hrs[id[2]] <- wait$gr12hrs[id[1]] + wait$gr12hrs[id[2]]
wait$per.w4hrs[id[2]] <- round(wait$w4hrs[id[2]]/wait$attendances[id[2]]*100, 1)
wait$per.gr8hrs[id[2]] <- round(wait$gr8hrs[id[2]]/wait$attendances[id[2]]*100, 1)
wait$per.gr12hrs[id[2]] <- round(wait$gr12hrs[id[2]]/wait$attendances[id[2]]*100, 1)
wait <- wait[-id[1],]

# Remove "G306H" "G516H" from the waiting times data
wait <- subset(wait, loc %in% unique(wait$loc)[-c(16,17)])

Modelling Covid

The dates in which we put the indicator or covid-19 were chosen based on the dates published here: https://www.covid19inquiry.scot/covid-19-pandemic-scotland-timeline-key-dates however the end date seems to be arbitrary choice which was chosen based on the shift in attention towards other emergencies e.g., war in Ukraine affecting gas prices.

# Create an indicator variable for COVID-19 years
covid <- seq.Date(from=as.Date("2020-03-01"),to=as.Date("2022-03-01"), by="month")
covid <- gsub("-","",covid)
covid <- substr(covid, 1, nchar(covid)-2)
load$covid <- ifelse(load$monthyr %in% covid, 1, 0)

covid_start <- as.Date("2020-03-26")
covid_end <- as.Date("2022-03-26")
wait$covid <- ifelse(wait$date >= covid_start & wait$date <= covid_end, 1, 0)

Exploratory Data Analysis

A&E Demand

# Grouping load by month of the year to plot A&E demand by location
load.month <- load %>%
  group_by(loc, year, month, day.num, cmonth, covid) %>%
  summarize(total_attendances = sum(attendances, na.rm = TRUE))
## `summarise()` has grouped output by 'loc', 'year', 'month', 'day.num',
## 'cmonth'. You can override using the `.groups` argument.
load.month.plot <- load %>%
  group_by(loc, year, month, cmonth, covid) %>%
  summarize(total_attendances = sum(attendances, na.rm = TRUE))
## `summarise()` has grouped output by 'loc', 'year', 'month', 'cmonth'. You can
## override using the `.groups` argument.
load.month.plot <- load.month.plot %>%
  mutate(yearmon = as.yearmon(paste(year, month, sep = "-")))
last_dates <- load.month.plot %>%
  group_by(loc) %>%
  filter(yearmon == max(yearmon)) %>%
  ungroup()

# Create the plot for each A&E demand for each treatment location 
p <- ggplot(load.month.plot, aes(x = yearmon, y = total_attendances, color = loc)) +
  geom_line(linewidth = 0.3) +
  labs(title = "Total A&E Attendances for all Treatment Locations in Scotland",
       x = "Date (Year-Month)", 
       y = "Total Attendances") +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10,margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    axis.title.x = element_text(margin = margin(t = 15))
  ) +
  geom_text_repel(data = last_dates, aes(label = loc), 
                  nudge_x = 0.4, size = 4, 
                  show.legend = FALSE, direction = "y",
                  segment.size = 0.2, segment.color = "grey50") +
  geom_vline(xintercept = as.yearmon("2020-03"), linetype = "dashed", color = "red", size = 0.5) +  # Lockdown line
  annotate("text", x = as.yearmon("2020-03"), y = max(load.month.plot$total_attendances), 
           label = "First Nationwide COVID-19 lockdown", color = "red", vjust = -1, hjust = -0.1, size = 3) +  # Annotation for lockdown
  scale_x_yearmon(format = "%Y-%m", n = 10) +  
  expand_limits(x = c(min(load.month.plot$yearmon), max(load.month.plot$yearmon)))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p)

filename <- "a&eallloc.png"
ggsave(filename, plot = p, width = 10, height = 6, units = "in", dpi = 300)
# Plot weekly cycle for each month for the first 6 years in each location
par(mfrow = c(2,3))
loc <- unique(load$loc[load$dpttype != "MIU"])
for (j in loc) {
data <- load[load$loc == j,]
for (k in 2018:2023) {
yl <- range(data$attendances)
with(data[data$year==k&data$month==1,], plot(chr[order(chr)], attendances[order(chr)], type="l", ylim = yl, xlab = "Hour of the week", ylab = unique(data$loc), main = k))
for (m in 2:12) with(data[data$year==k&data$month==m,], lines(chr[order(chr)], attendances[order(chr)], col=m))
}
readline()
}

# Cumulative hour for locations
load.cm.plot <- load %>%
  filter(loc %in% c("S314H", "T101H", "G405H"), year %in% c(2018, 2020, 2023), month %in% c(1, 4, 7, 10))

# Create the ggplot for the filtered data
e <- ggplot(load.cm.plot, aes(x = chr, y = attendances, color = factor(month))) +
  geom_line(size = 0.6) +
  facet_grid(scales = "free_y",rows = vars(year), cols = vars(loc)) + 
  labs(title = "Cumulative Hourly Attendances for Selected Locations and Years",
       x = "Cumulative Hour",
       y = "Attendances",
       color = "Month") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(margin = margin(t = 15)),
    legend.position = "bottom",
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    strip.background = element_blank(),
    panel.spacing = unit(1, "lines"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
  )

# Print the combined plot
print(e)

filename <- "chr_someloc.png"
ggsave(filename, plot = e, width = 10, height = 6, units = "in", dpi = 300)
# Heatmap of month against cumulative hour
load_agg_data <- load %>%
  group_by(month, chr) %>%
  filter(year != 2024) %>%
  summarise(total_attendances = sum(attendances))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
# Reshape the data into a wide format for the heatmap
heatmap_data <- dcast(load_agg_data, chr ~ month, value.var = "total_attendances")

# Plotting the heatmap
pp <- ggplot(melt(heatmap_data, id.vars = "chr"), aes(x = variable, y = chr, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(option = "plasma", na.value = "white") +
  labs(x = "Month", y = "Cumulative Hour", fill = "Attendances") +
  ggtitle("Heatmap of Total A&E Attendances by Cumulative Hour and Month") +
  theme_minimal() + 
  theme(
    axis.title.x = element_text(margin = margin(t = 15)),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.title = element_text(size = 10),
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    strip.background = element_blank(),
    panel.spacing = unit(1, "lines"),
  )

print(pp)

filename <- "heat.png"
ggsave(filename, plot = pp, width = 10, height = 6, units = "in", dpi = 300)

Waiting Times

# Plot percentage of attendances within 4hours
fewew <- ggplot(wait, aes(x = date, y=per.w4hrs)) +
  geom_line() +
  facet_wrap(~ loc, ncol=5) +
  labs(title = "Percentage of Wait Times by within 4 hours from the week of 2015-02-22 to 2024-05-12", y = "Percentage of Attendances within 4 hours",
       x="Week end date") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(margin = margin(t = 15)),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.title = element_text(size = 10),
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    strip.background = element_blank(),
    panel.spacing = unit(1, "lines"),
  )

print(fewew)

ggsave("allwaitloc.png", plot = fewew, width = 10, height = 6, units = "in", dpi = 300)
# Define the specific locations to expand the plot
some_loc <- c("S314H", "C418H", "T101H", "C121H")
wait_some_loc <- filtered_wait <- wait %>%
  filter(loc %in% some_loc)

q <- ggplot(filtered_wait, aes(x = date, y = per.w4hrs, color = loc)) +
  geom_line(size = 0.3) +
  labs(
    title = "Percentage of Attendances within 4 Hours for Selected Locations",
    x = "Date",
    y = "Percentage of Attendances within 4 Hours",
    color = "ISD Hospital Location Codes"  
  ) +
  geom_vline( # Vertical line indicating when covid-19 began
    xintercept = as.Date("2020-03-26"), 
    linetype = "dashed", 
    color = "red", 
    size = 0.5
  ) +
  annotate(
    "text", 
    x = as.Date("2020-03-01"), 
    y = max(filtered_wait$per.w4hrs, na.rm = TRUE), 
    label = "First Nationwide COVID-19 lockdown", 
    color = "red", 
    vjust = -1, 
    hjust = -0.1, 
    size = 3
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",  
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    axis.title.x = element_text(margin = margin(t = 15))
  )

print(q)

filename <- "wait_someloc.png"
ggsave(filename, plot = q, width = 10, height = 6, units = "in", dpi = 300)
# Histogram plot to show the distribution of the data
# Distribution of waiting times 
d <- ggplot() +
  geom_histogram(data = wait, aes(x = gr4hrs, fill = "Greater than 4 hours"), bins = 50, color = "black", alpha = 0.5) +
  geom_histogram(data = wait, aes(x = w4hrs, fill = "Within 4 hours"), bins = 50, color = "black", alpha = 0.5) +
  labs(title = "Histogram of weekly A&E wait times",
       x = "Value", 
       y = "Frequency") +
  scale_fill_manual(values = c("skyblue", "orange"),
                    name = "Category",
                    labels = c("Greater than 4 hours", "Within 4 hours")) +
  coord_cartesian(xlim = c(0, 2000)) +
  theme_minimal() +
  theme(
    axis.title.x = element_text(margin = margin(t = 15)),
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    panel.spacing = unit(1, "lines"),
    legend.position = "bottom",  
    legend.box = "horizontal",
    legend.key.size = unit(0.4, "cm")
  )

# Distribution of A&E attendances
d1 <- ggplot(load, aes(x = attendances)) +
  geom_histogram(bins = 50, alpha = 0.5, color = "black") +
  labs(title = "Histogram of A&E attendances",
       x = "Value", 
       y = "Frequency") +
  theme_minimal() +
  theme(
    axis.title.x = element_text(margin = margin(t = 15)),
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    panel.spacing = unit(1, "lines"),
  )

d2 <- grid.arrange(d, d1, ncol = 2)

print(d2)
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
# filename <- "hist.png"
# ggsave(filename, plot = d2, width = 10, height = 6, units = "in", dpi = 300)

Model fitting

Hourly A&E Demand

# Create a rolling window using 6 months for testing and the rest for training
train.load.1 <- load %>% filter(cmonth <= 67)
train.load.2 <- load %>% filter(cmonth >= 2, cmonth <= 68)
train.load.3 <- load %>% filter(cmonth >= 3, cmonth <= 69)

test.load.1 <- load %>% filter(cmonth >= 68, cmonth <= 73)
test.load.2 <- load %>% filter(cmonth >= 69, cmonth <= 74)
test.load.3 <- load %>% filter(cmonth >= 70)

# Train each model specification on each rolling window and examine performance on test dataset
list.train.load <- list(train.load.1, train.load.2, train.load.3)
list.test.load <- list(test.load.1, test.load.2, test.load.3)
knots <- list(day.num=c(1,8), hr.num=c(0,24), chr=c(0,168), month=c(0,12))

# Initialize lists
hr.rmse.results <- list(m1 = list(), m2 = list(), m3 = list())
m1.hr.fit <- list(); m2.hr.fit <- list(); m3.hr.fit <- list()
# Fitting model 1
for (i in 1:3) {
  m1.hr.fit[[i]] <- bam(attendances ~ loc + covid + s(day.num, k=7, bs="cc") + s(chr, k=80, bs="ad") + s(hr.num, k=20, bs="cc") + te(cmonth, month, k=c(40,10), bs=c("cr","cc")), family = tw, data = list.train.load[[i]], method= "fREML",knots = knots, discrete = TRUE)
  
  hr.rmse.results$m1[[i]] <- list(rmse = rmse.load.hr(m1.hr.fit[[i]], list.test.load[[i]]), data_trained_on = paste("train.load.", i, sep = ""))
}
# Fiting model 2
for (i in 1:3) {
  m2.hr.fit[[i]] <- bam(attendances ~ loc + covid + as.factor(inout) + s(year, k=6, bs="cr") + s(chr, k=100, bs="ad") + s(hr.num, k=20, bs="cc") + te(cmonth, month, k=c(40,10), bs=c("cr","cc")), 
family = tw, data = list.train.load[[i]], knots = knots, discrete = TRUE)

  hr.rmse.results$m2[[i]] <- list(rmse = rmse.load.hr(m2.hr.fit[[i]], list.test.load[[i]]), data_trained_on = paste("train.load.", i, sep = ""))
}
# Fitting model 3
for (i in 1:3) {
  m3.hr.fit[[i]] <- bam(attendances ~ s(loc, k=30, bs="re") + covid + inout + te(chr, hr.num, k=c(100,20), bs=c("cc","cc")) + te(cmonth, month, k=c(40,10), bs=c("cr","cc")),family = tw, data = list.train.load[[i]], knots = knots, discrete = TRUE)
  
  hr.rmse.results$m3[[i]] <- list(rmse = rmse.load.hr(m3.hr.fit[[i]], list.test.load[[i]]), data_trained_on = paste("train.load.", i, sep = ""))
}

Model Assessment

# Print RMSE results for each model
print(hr.rmse.results)
## $m1
## $m1[[1]]
## $m1[[1]]$rmse
## [1] 7.363933
## 
## $m1[[1]]$data_trained_on
## [1] "train.load.1"
## 
## 
## $m1[[2]]
## $m1[[2]]$rmse
## [1] 8.183022
## 
## $m1[[2]]$data_trained_on
## [1] "train.load.2"
## 
## 
## $m1[[3]]
## $m1[[3]]$rmse
## [1] 7.41899
## 
## $m1[[3]]$data_trained_on
## [1] "train.load.3"
## 
## 
## 
## $m2
## $m2[[1]]
## $m2[[1]]$rmse
## [1] 7.420417
## 
## $m2[[1]]$data_trained_on
## [1] "train.load.1"
## 
## 
## $m2[[2]]
## $m2[[2]]$rmse
## [1] 7.713513
## 
## $m2[[2]]$data_trained_on
## [1] "train.load.2"
## 
## 
## $m2[[3]]
## $m2[[3]]$rmse
## [1] 7.305569
## 
## $m2[[3]]$data_trained_on
## [1] "train.load.3"
## 
## 
## 
## $m3
## $m3[[1]]
## $m3[[1]]$rmse
## [1] 7.360321
## 
## $m3[[1]]$data_trained_on
## [1] "train.load.1"
## 
## 
## $m3[[2]]
## $m3[[2]]$rmse
## [1] 8.184336
## 
## $m3[[2]]$data_trained_on
## [1] "train.load.2"
## 
## 
## $m3[[3]]
## $m3[[3]]$rmse
## [1] 7.414629
## 
## $m3[[3]]$data_trained_on
## [1] "train.load.3"
# Calculate and print mean RMSE for each model specification
load.mean.rmse <- list(
  m1 = mean(sapply(hr.rmse.results$m1, function(x) x$rmse)),
  m2 = mean(sapply(hr.rmse.results$m2, function(x) x$rmse)),
  m3 = mean(sapply(hr.rmse.results$m3, function(x) x$rmse))
)

print(load.mean.rmse)
## $m1
## [1] 7.655315
## 
## $m2
## [1] 7.479833
## 
## $m3
## [1] 7.653095
# Compute the baseline forecast
baseline.rmse.load.1 <- load.baseline.rmse(train.load.1, test.load.1)
baseline.rmse.load.2 <- load.baseline.rmse(train.load.2, test.load.2)
baseline.rmse.load.3 <- load.baseline.rmse(train.load.3, test.load.3)

baseline.rmse.load.1; baseline.rmse.load.2; baseline.rmse.load.3
## [1] 7.849163
## [1] 7.788533
## [1] 7.910597
# Average the baselines
(baseline.rmse.load.1 + baseline.rmse.load.2 + baseline.rmse.load.3)/3
## [1] 7.849431
# Compare AIC of the two models with RMSE lower than the baseline
load.aic <- list(
  m2 = sapply(m2.hr.fit, function(x) AIC(x)),
  m3 = sapply(m3.hr.fit, function(x) AIC(x))
)
print(load.aic)
## $m2
## [1] 1861597 1973448 1975227
## 
## $m3
## [1] 1971333 1972571 1974363
appraise(m1.hr.fit[[1]])

appraise(m1.hr.fit[[2]])

appraise(m1.hr.fit[[3]])

appraise(m2.hr.fit[[1]])

appraise(m2.hr.fit[[2]])

appraise(m2.hr.fit[[3]])

appraise(m3.hr.fit[[1]])

appraise(m3.hr.fit[[2]])

appraise(m3.hr.fit[[3]])

6 Month ahead prediction

# Create new prediction dataframe
load.hr.forecast <- load[load$month %in% c(4:9) & load$year == 2023,]
oldcmon <- sort(unique(load.hr.forecast$cmonth))
newcmon <- 76:81
for(i in 1:6) {
    load.hr.forecast$cmonth[load.hr.forecast$cmonth == oldcmon[i]] <- newcmon[i]
}
load.hr.forecast$year <- 2024; load.hr.forecast <- load.hr.forecast[,-9]
# Retrain model with all data
m2.hr.full <- bam(attendances ~ loc + covid + inout + s(year, k=6, bs="cr") + s(chr, k=100, bs="ad") + s(hr.num, k=20, bs="cc") + te(cmonth, month, k=c(40,10), bs=c("cr","cc")), 
family = tw, data = load, knots = knots, discrete = TRUE)

load.hr.forecast <- cbind(load.hr.forecast, predict.bam(m2.hr.full,load.hr.forecast,type="response"))
names(load.hr.forecast)[16] <- "fit"
draw(m2.hr.full, rug = NULL)

summary(m2.hr.full); appraise(m2.hr.full)
## 
## Family: Tweedie(p=1.105) 
## Link function: log 
## 
## Formula:
## attendances ~ loc + covid + inout + s(year, k = 6, bs = "cr") + 
##     s(chr, k = 100, bs = "ad") + s(hr.num, k = 20, bs = "cc") + 
##     te(cmonth, month, k = c(40, 10), bs = c("cr", "cc"))
## 
## Parametric coefficients:
##                    Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)        3.260430   0.033943   96.057  < 2e-16 ***
## locA210H          -0.654967   0.003577 -183.087  < 2e-16 ***
## locB120H          -0.758566   0.003687 -205.722  < 2e-16 ***
## locC121H          -2.071722   0.005814 -356.353  < 2e-16 ***
## locC313H          -0.768445   0.003698 -207.790  < 2e-16 ***
## locC418H          -0.069657   0.003073  -22.666  < 2e-16 ***
## locF704H           0.013497   0.003016    4.475 7.63e-06 ***
## locG107H           0.289653   0.002847  101.741  < 2e-16 ***
## locG405H           0.394168   0.002791  141.225  < 2e-16 ***
## locG513H           0.025097   0.003008    8.343  < 2e-16 ***
## locH103H          -1.993416   0.005641 -353.381  < 2e-16 ***
## locH202H          -0.595155   0.003517 -169.225  < 2e-16 ***
## locH212H          -1.965699   0.005581 -352.182  < 2e-16 ***
## locL106H           0.033876   0.003002   11.284  < 2e-16 ***
## locL302H          -0.031269   0.003046  -10.264  < 2e-16 ***
## locL308H           0.108727   0.002954   36.807  < 2e-16 ***
## locN101H          -0.149163   0.003131  -47.640  < 2e-16 ***
## locN121H          -1.413557   0.004560 -310.024  < 2e-16 ***
## locN411H          -0.904659   0.003854 -234.705  < 2e-16 ***
## locR103H          -2.295874   0.006348 -361.695  < 2e-16 ***
## locS308H          -0.136461   0.003122  -43.715  < 2e-16 ***
## locS314H           0.619116   0.002684  230.672  < 2e-16 ***
## locS319H          -0.237528   0.003199  -74.249  < 2e-16 ***
## locT101H          -0.187135   0.003160  -59.223  < 2e-16 ***
## locT202H          -1.043429   0.004027 -259.110  < 2e-16 ***
## locV217H          -0.068233   0.003072  -22.210  < 2e-16 ***
## locW107H          -2.425733   0.006685 -362.844  < 2e-16 ***
## locY144H          -1.628212   0.004922 -330.819  < 2e-16 ***
## locY146H          -0.692757   0.003617 -191.545  < 2e-16 ***
## locZ102H          -2.177201   0.006057 -359.428  < 2e-16 ***
## covid              0.121637   0.055701    2.184    0.029 *  
## inoutOut of Hours  0.010586   0.005946    1.780    0.075 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                     edf Ref.df        F p-value    
## s(year)           1.011  1.011    0.894   0.342    
## s(chr)           62.024 68.775  335.443  <2e-16 ***
## s(hr.num)        17.955 18.000 1827.486  <2e-16 ***
## te(month,cmonth) 71.377 71.973  603.829  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.89   Deviance explained = 89.8%
## fREML = 5.9391e+05  Scale est. = 1.2431    n = 378000

# Obtain confidence intervals by simulating from the posterior
# Reorder the data
load.hr.forecast <- load.hr.forecast[order(load.hr.forecast$loc, load.hr.forecast$cmonth, load.hr.forecast$chr),]

# Obtain lp matrix and simulate from the posterior
X_P <- predict(m2.hr.full, load.hr.forecast, type="lpmatrix")
n <- 5000
br <- rmvn(n, coef(m2.hr.full), vcov(m2.hr.full))

# Compute the linear predictor
tmp <- list()
for(i in 1:n) {
    l_p <- X_P %*% br[i,]
    tmp[[i]] <- exp(l_p)
}

# Sum the observations by hospital location and cmonth
sums_list <- lapply(tmp, sum_168)
identify <- load.hr.forecast %>%
    group_by(loc, cmonth) %>%
    summarise()
## `summarise()` has grouped output by 'loc'. You can override using the `.groups`
## argument.
forecast.CI.mat <- do.call(cbind,sums_list)
forecast.CI.df <- as.data.frame(forecast.CI.mat)
forecast.CI.df <- cbind(identify, forecast.CI.df)

# Compute credible interval
quantiles <- apply(forecast.CI.df[,3:5002],1,function(x) quantile(x,c(0.025,0.975)))
identify$lower <- quantiles[1,]
identify$upper <- quantiles[2,]

# Sum forecasts and merge to get the CI
load.mon.forecast <- load.hr.forecast %>% 
    group_by(loc, cmonth, year, month) %>%
    summarise(total_attendances = sum(fit)) %>%
    mutate(yearmon = as.yearmon(paste(year, month, sep="-")))
## `summarise()` has grouped output by 'loc', 'cmonth', 'year'. You can override
## using the `.groups` argument.
load.mon.forecast <- merge(load.mon.forecast, identify, by=c("loc", "cmonth"))
# Identifer dataframe
iden.scotland <- load.hr.forecast %>%
    group_by(cmonth) %>%
    summarise()

# Obtain the credible interval for the whole of scotland 
scotland_load <- lapply(sums_list, sum_by_position)

forecast.CI.scotland <- do.call(cbind,scotland_load)
forecast.CI.scotland.df <- as.data.frame(forecast.CI.scotland)
forecast.CI.scotland.df <- cbind(iden.scotland, forecast.CI.scotland.df)

# Compute credible interval
quantiles <- apply(forecast.CI.scotland.df[,2:5001],1,function(x) quantile(x,c(0.025,0.975)))
iden.scotland$lower <- quantiles[1,]
iden.scotland$upper <- quantiles[2,]

# Sum forecasts and merge to get the CI
load.scotland.forecast <- load.hr.forecast %>% 
    group_by(cmonth, year, month) %>%
    summarise(total_attendances = sum(fit)) %>%
    mutate(yearmon = as.yearmon(paste(year, month, sep="-")))
## `summarise()` has grouped output by 'cmonth', 'year'. You can override using
## the `.groups` argument.
load.scotland.forecast <- merge(load.scotland.forecast, iden.scotland, by=c("cmonth"))

Plotting the forecast

# Transform all historical data into monthly totals
load.month.plot <- load %>%
  group_by(loc, year, month, cmonth, covid) %>%
  summarize(total_attendances = sum(attendances, na.rm = TRUE)) %>%
  mutate(yearmon = as.yearmon(paste(year,month,sep="-")))
## `summarise()` has grouped output by 'loc', 'year', 'month', 'cmonth'. You can
## override using the `.groups` argument.
# Combine historical data with forecasts along with the CI
load.month.hr.full <- bind_rows(
  load.month.plot %>% mutate(source = "Actual"),
  load.mon.forecast %>% select(loc, yearmon, total_attendances, lower, upper) %>% 
    mutate(source = "Forecast")
)

# Filter the data to use historical data from 2023
load.month.hr.full.2023 <- load.month.hr.full %>%
  filter(yearmon >= as.yearmon("2023-01"))
# Create the plot
qq <- ggplot(load.month.hr.full.2023, aes(x = yearmon, y = total_attendances, color = source)) +
  geom_line(data = load.month.hr.full.2023 %>% filter(source == "Actual"), size = 0.3) +
  geom_line(data = load.month.hr.full.2023 %>% filter(source == "Forecast"), size = 0.3, linetype = "dashed") +
  geom_ribbon(data = load.month.hr.full.2023 , aes(x = yearmon, ymin = lower, ymax = upper, fill = loc), alpha = 0.2, inherit.aes = FALSE) +
  labs(
    title = "Total A&E Attendances and Forecasts by Location",
    x = "Date (Year-Month)",
    y = "Total Attendances"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",  
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    axis.title.x = element_text(margin = margin(t = 15)),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)  
  ) +
  scale_x_yearmon(format = "%Y-%m", n = 10, breaks = seq(min(load.month.hr.full.2023$yearmon), max(load.month.hr.full.2023$yearmon), by = 6/12)) +  
  facet_wrap(~ loc, scales = "free_y", ncol = 6)  

print(qq)

# Save the plot
ggsave("demand_forecast_all.png", plot = qq, width = 12, height = 8, dpi = 300)
# Select some locations to plot
load.plot.some.2023 <- load.month.hr.full.2023 %>% filter(loc %in% c("S314H", "C418H", "T101H", "C121H"))

load.plot.some.forecast <- load.plot.some.2023 %>% filter(source == "Forecast")
load.plot.some.actual <- load.plot.some.2023 %>% filter(source == "Actual")

qqq <- ggplot(load.plot.some.2023 , aes(x = yearmon, y = total_attendances, color = loc)) +
  geom_line(data = load.plot.some.actual, aes(color = loc), linewidth = 0.3) +  # Actual data lines
  geom_line(data = load.plot.some.forecast, aes(color = loc), linewidth = 0.3, linetype = "dashed") +  # Forecast data lines
  geom_ribbon(data = load.plot.some.2023 , aes(x = yearmon, ymin = lower, ymax = upper, fill = loc), 
              alpha = 0.2, inherit.aes = FALSE) +  # Credible intervals
  labs(title = "Monthly Forecast for A&E Attendances in Selected Treatment Locations in Scotland",
       x = "Date (Year-Month)", 
       y = "Total Attendances") +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    axis.title.x = element_text(margin = margin(t = 15))
  ) +
  scale_x_yearmon(format = "%Y-%m", n = 10, breaks = seq(min(load.plot.some.2023$yearmon), max(load.plot.some.2023$yearmon), by = 4/12)) + 
  expand_limits(x = c(min(load.plot.some.2023$yearmon), max(load.plot.some.2023$yearmon))) +
  facet_wrap(~ loc, scales = "free_y")  # Create separate panels for each location

# Print the plot
print(qqq)

ggsave("load.someforecast.png", plot = qqq, width = 10, height = 6, units = "in", dpi = 300)
# Forecast for whole of scotland 
historical_scotland <- load %>%
  group_by(year, month) %>%
  summarize(total_attendances = sum(attendances, na.rm = TRUE)) %>%
  mutate(yearmon = as.yearmon(paste(year, month, sep = "-")))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Combine historical data with forecasts along with the CI
load.scotland.full <- bind_rows(
  historical_scotland %>% mutate(source = "Actual"),
  load.scotland.forecast %>% select(yearmon, total_attendances, lower, upper) %>% 
    mutate(source = "Forecast")
)
# Filter the data to use historical data from 2023
load.scotland.full.2023 <- load.scotland.full %>%
  filter(yearmon >= as.yearmon("2023-01"))

scotland_plot <- ggplot(load.scotland.full.2023, aes(x = yearmon, y = total_attendances, color = source)) +
  geom_line(data = load.scotland.full.2023 %>% filter(source == "Actual"), size = 0.3) +
  geom_line(data = load.scotland.full.2023 %>% filter(source == "Forecast"), size = 0.3, linetype = "dashed") +
  geom_ribbon(data = load.scotland.full.2023 %>% filter(source == "Forecast"),
              aes(x = yearmon, ymin = lower, ymax = upper), alpha = 0.2, inherit.aes = FALSE) +
  labs(
    title = "Total A&E Attendances and Forecasts for Scotland",
    x = "Date (Year-Month)",
    y = "Total Attendances"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",  
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    axis.title.x = element_text(margin = margin(t = 15)),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)  
  ) +
  scale_x_yearmon(format = "%Y-%m", n = 6, breaks = seq(min(load.scotland.full.2023$yearmon), max(load.scotland.full.2023$yearmon), by = 1/12))

print(scotland_plot)

ggsave("scotlandforecast.png", plot = scotland_plot, width = 10, height = 6, units = "in", dpi = 300)

Waiting times

# Create a rolling window and 6 month test
train.wait.1 <- wait %>% filter(julian <= 3087)
train.wait.2 <- wait %>% filter(julian >= 56, julian <= 3143)
train.wait.3 <- wait %>% filter(julian >= 112, julian <= 3199)
test.wait.1 <- wait %>% filter(julian >= 3094, julian <= 3255)
test.wait.2 <- wait %>% filter(julian >= 3150, julian <= 3311)
test.wait.3 <- wait %>% filter(julian >= 3206)
# Train each model specification on each rolling window and examine performance on test dataset
list.train.wait <- list(train.wait.1, train.wait.2, train.wait.3)
list.test.wait <- list(test.wait.1, test.wait.2, test.wait.3)
knots.wait <- list(dom=c(0,31), month=c(0,12))

# Initialize lists for model fits and results
m1.wait.fit <- list(); m2.wait.fit <- list(); m3.wait.fit <- list()
wait.rmse.results <- list(m1 = list(), m2 = list(), m3 = list())
# Model 1 training
for (i in 1:3) {
  m1.wait.fit[[i]] <- bam(cbind(w4hrs, gr4hrs) ~ covid + HBT + s(loc, k=30, bs="re") + s(julian, k=80, bs="ad") + te(dom, month, k=c(30,12), bs=c("cc","cc")),family=quasibinomial, data=list.train.wait[[i]], knots=knots.wait)
  
  wait.rmse.results$m1[[i]] <- list(rmse = rmse.wait(m1.wait.fit[[i]], list.test.wait[[i]]),data_trained_on = paste("train.wait.", i, sep=""))
}
# Model 2 training
for(i in 1:3) {
  m2.wait.fit[[i]] <- bam(cbind(w4hrs, gr4hrs) ~ covid + s(year, k=8, bs="cr") + s(loc, k=30, bs="re") + s(julian, k=80, bs="ad") + te(dom, month, k=c(30,12), bs=c("cc","cc")),family=quasibinomial, data=list.train.wait[[i]], knots=knots.wait)
  
  wait.rmse.results$m2[[i]] <- list(rmse = rmse.wait(m2.wait.fit[[i]], list.test.wait[[i]]),data_trained_on = paste("train.wait.", i, sep=""))
}  
# Model 3 training
for(i in 1:3) {
  m3.wait.fit[[i]] <- bam(cbind(w4hrs, gr4hrs) ~ covid + loc + s(year, k=8, bs="cr") + s(julian, k=80, bs="ad") + te(dom, month, k=c(30,12), bs=c("cc","cc")),family=quasibinomial, data=list.train.wait[[i]], knots=knots.wait)
  
  wait.rmse.results$m3[[i]] <- list(rmse = rmse.wait(m3.wait.fit[[i]], list.test.wait[[i]]), data_trained_on = paste("train.wait.", i, sep=""))
}

Model Assessment

print(wait.rmse.results)
## $m1
## $m1[[1]]
## $m1[[1]]$rmse
## [1] 0.0911247
## 
## $m1[[1]]$data_trained_on
## [1] "train.wait.1"
## 
## 
## $m1[[2]]
## $m1[[2]]$rmse
## [1] 0.06265235
## 
## $m1[[2]]$data_trained_on
## [1] "train.wait.2"
## 
## 
## $m1[[3]]
## $m1[[3]]$rmse
## [1] 0.08818964
## 
## $m1[[3]]$data_trained_on
## [1] "train.wait.3"
## 
## 
## 
## $m2
## $m2[[1]]
## $m2[[1]]$rmse
## [1] 0.08645065
## 
## $m2[[1]]$data_trained_on
## [1] "train.wait.1"
## 
## 
## $m2[[2]]
## $m2[[2]]$rmse
## [1] 0.08607701
## 
## $m2[[2]]$data_trained_on
## [1] "train.wait.2"
## 
## 
## $m2[[3]]
## $m2[[3]]$rmse
## [1] 0.0613574
## 
## $m2[[3]]$data_trained_on
## [1] "train.wait.3"
## 
## 
## 
## $m3
## $m3[[1]]
## $m3[[1]]$rmse
## [1] 0.08633277
## 
## $m3[[1]]$data_trained_on
## [1] "train.wait.1"
## 
## 
## $m3[[2]]
## $m3[[2]]$rmse
## [1] 0.08598067
## 
## $m3[[2]]$data_trained_on
## [1] "train.wait.2"
## 
## 
## $m3[[3]]
## $m3[[3]]$rmse
## [1] 0.06127696
## 
## $m3[[3]]$data_trained_on
## [1] "train.wait.3"
# Calculate and store mean RMSE for each model specification
wait.mean.rmse <- list(
  m1 = mean(sapply(wait.rmse.results$m1, function(x) x$rmse)),
  m2 = mean(sapply(wait.rmse.results$m2, function(x) x$rmse)),
  m3 = mean(sapply(wait.rmse.results$m3, function(x) x$rmse))
)

print(wait.mean.rmse)
## $m1
## [1] 0.08065557
## 
## $m2
## [1] 0.07796169
## 
## $m3
## [1] 0.07786347
# Baseline forecast
baseline.rmse.wait.1 <- wait.baseline.rmse(train.wait.1, test.wait.1)
baseline.rmse.wait.2 <- wait.baseline.rmse(train.wait.2, test.wait.2)
baseline.rmse.wait.3 <- wait.baseline.rmse(train.wait.3, test.wait.3)

baseline.rmse.wait.1; baseline.rmse.wait.2; baseline.rmse.wait.3
## [1] 0.07517775
## [1] 0.0801787
## [1] 0.08241553
(baseline.rmse.wait.1 + baseline.rmse.wait.2 + baseline.rmse.wait.3)/3
## [1] 0.07925733
# Anova table using F-ratio test
anova(m1.wait.fit[[3]], m2.wait.fit[[3]], m3.wait.fit[[3]],test="F")
## Analysis of Deviance Table
## 
## Model 1: cbind(w4hrs, gr4hrs) ~ covid + HBT + s(loc, k = 30, bs = "re") + 
##     s(julian, k = 80, bs = "ad") + te(dom, month, k = c(30, 12), 
##     bs = c("cc", "cc"))
## Model 2: cbind(w4hrs, gr4hrs) ~ covid + s(year, k = 8, bs = "cr") + s(loc, 
##     k = 30, bs = "re") + s(julian, k = 80, bs = "ad") + te(dom, 
##     month, k = c(30, 12), bs = c("cc", "cc"))
## Model 3: cbind(w4hrs, gr4hrs) ~ covid + loc + s(year, k = 8, bs = "cr") + 
##     s(julian, k = 80, bs = "ad") + te(dom, month, k = c(30, 12), 
##     bs = c("cc", "cc"))
##   Resid. Df Resid. Dev        Df Deviance      F  Pr(>F)  
## 1     13060     216034                                    
## 2     13082     216657 -21.85610  -622.58 1.6588 0.02738 *
## 3     13082     216658  -0.12476    -0.61 0.2828 0.19747  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
appraise(m1.wait.fit[[1]])

appraise(m1.wait.fit[[2]])

appraise(m1.wait.fit[[3]])

appraise(m2.wait.fit[[1]])

appraise(m2.wait.fit[[2]])

appraise(m2.wait.fit[[3]])

appraise(m3.wait.fit[[1]])

appraise(m3.wait.fit[[2]])

appraise(m3.wait.fit[[3]])

6 Month ahead prediction

# Create forecast dataframe
wait.forecast <- test.wait.3[,-c(1,4:12,18:25)]
new_dates <- seq.Date(from = as.Date("2024-05-19"), by = "week", length.out = 24)
new_julian <- seq(3374, 3535, 7)

wait.forecast <- wait.forecast %>%
  group_by(loc) %>% 
  mutate(date = rep(new_dates, length.out = n()),  
         julian = rep(new_julian, length.out = n())) %>%
  ungroup()  

wait.forecast$year <- 2024
wait.forecast$month <- month(wait.forecast$date)
wait.forecast$dom <- day(wait.forecast$date)
# Rerun the model with the full dataset to make the forecast
m3.wait.full <- bam(cbind(w4hrs, gr4hrs) ~ covid + loc + s(year, k=8, bs="cr") + s(julian, k=80, bs="ad") + te(dom, month, k=c(30,12), bs=c("cc","cc")),family=quasibinomial, data=wait, knots=knots.wait)

wait.forecasted.values <- cbind(wait.forecast, predict(m3.wait.full,wait.forecast,se=TRUE,type="response"))
summary(m3.wait.full)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## cbind(w4hrs, gr4hrs) ~ covid + loc + s(year, k = 8, bs = "cr") + 
##     s(julian, k = 80, bs = "ad") + te(dom, month, k = c(30, 12), 
##     bs = c("cc", "cc"))
## 
## Parametric coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.9469613  0.0210401  92.536  < 2e-16 ***
## covid        0.0638433  0.0641535   0.995  0.31967    
## locA210H    -0.2555106  0.0257185  -9.935  < 2e-16 ***
## locB120H    -0.0666821  0.0276830  -2.409  0.01602 *  
## locC121H     1.4713476  0.0760765  19.340  < 2e-16 ***
## locC313H     0.2822879  0.0296670   9.515  < 2e-16 ***
## locC418H    -0.3844698  0.0216505 -17.758  < 2e-16 ***
## locF704H     0.0153675  0.0223041   0.689  0.49084    
## locG107H    -0.4515662  0.0199947 -22.584  < 2e-16 ***
## locG405H    -0.7542473  0.0192873 -39.106  < 2e-16 ***
## locG513H     1.6576712  0.0320931  51.652  < 2e-16 ***
## locH103H     0.8805456  0.0588858  14.953  < 2e-16 ***
## locH202H     0.1374605  0.0271314   5.066  4.1e-07 ***
## locH212H     0.9413024  0.0594708  15.828  < 2e-16 ***
## locL106H    -0.0599118  0.0219877  -2.725  0.00644 ** 
## locL302H    -0.4698573  0.0213032 -22.056  < 2e-16 ***
## locL308H    -0.4832920  0.0207319 -23.312  < 2e-16 ***
## locN101H    -0.4729742  0.0219244 -21.573  < 2e-16 ***
## locN121H     1.4924297  0.0560049  26.648  < 2e-16 ***
## locN411H     0.3618316  0.0316816  11.421  < 2e-16 ***
## locR103H     1.4097837  0.0828073  17.025  < 2e-16 ***
## locS308H     0.0003079  0.0231784   0.013  0.98940    
## locS314H    -0.6932405  0.0186649 -37.141  < 2e-16 ***
## locS319H     1.4644176  0.0333140  43.958  < 2e-16 ***
## locT101H     1.2196729  0.0306209  39.831  < 2e-16 ***
## locT202H     1.4128056  0.0468019  30.187  < 2e-16 ***
## locV217H    -0.5957136  0.0212090 -28.088  < 2e-16 ***
## locW107H     2.2742350  0.1299101  17.506  < 2e-16 ***
## locY144H     0.8567995  0.0492861  17.384  < 2e-16 ***
## locY146H     0.2717760  0.0286812   9.476  < 2e-16 ***
## locZ102H     1.2926061  0.0743009  17.397  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf  Ref.df      F  p-value    
## s(year)        6.24   6.702  6.111 1.41e-05 ***
## s(julian)     61.06  66.964 44.626  < 2e-16 ***
## te(dom,month) 74.67 309.000  1.292  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.895   Deviance explained = 87.5%
## fREML =  41449  Scale est. = 17.261    n = 14460
appraise(m3.wait.full); draw(m3.wait.full, rug=NULL)

# Compute the confidence intervals
wait.forecasted.values <- wait.forecasted.values %>%
  mutate(lower_ci = fit - 1.96 * se.fit, 
         upper_ci = fit + 1.96 * se.fit)

# Bind the forecast with the data
wait.full <- bind_rows(
  wait %>% mutate(source = "Actual"),
  wait.forecasted.values %>% select(loc, date, fit, lower_ci, upper_ci) %>% 
    rename(prop.w4hrs = fit) %>%
    mutate(source = "Forecast")
)

Plotting the Forecast

wait.full.all.loc <- wait.full %>%
  filter(date >= as.Date("2023-01-01"))

wait.forecasted.values.plot <- wait.forecasted.values %>%
  mutate(source = "Forecast")

wait.full.plot <- bind_rows(
  wait.full.all.loc %>% mutate(source = "Actual"),
  wait.forecasted.values.plot %>% select(loc, date, fit, lower_ci, upper_ci, source)
)
pew <- ggplot(wait.full.plot, aes(x = date, y = prop.w4hrs, color = loc)) +
  geom_line(data = wait.full.plot %>% filter(source == "Actual"), size = 0.3) +  # Actual data lines
  geom_line(data = wait.full.plot %>% filter(source == "Forecast"), size = 0.3, linetype = "dashed") +  # Forecast data lines
  geom_ribbon(data = wait.full.plot %>% filter(source == "Forecast"), 
              aes(x = date, ymin = lower_ci, ymax = upper_ci, fill = loc), alpha = 0.2, inherit.aes = FALSE, show.legend = FALSE) +
  labs(
    title = "Forecast of the Percentage of Attendances within 4 Hours for All Locations",
    x = "Date",
    y = "Percentage of Attendances within 4 Hours"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    axis.title.x = element_text(margin = margin(t = 15)),
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none"
  ) +
  scale_x_date(date_labels = "%Y-%m", date_breaks = "6 months") +  
  facet_wrap(~ loc, scales = "free_y", ncol = 5) +  # 5x6 grid
  scale_y_continuous(labels = function(x) sprintf("%.1f", x), limits = c(0, 1))

# Print the plot
print(pew)
## Warning: Removed 720 rows containing missing values or values outside the scale range
## (`geom_line()`).

ggsave("wait_forecast_all.png", plot = pew, width = 12, height = 8, dpi = 300)
## Warning: Removed 720 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Plot a selected few hospital locations
wait.full.plot.some <- wait.full.plot %>% 
    filter(loc %in% c("S314H", "C418H", "T101H", "C121H"))

pew_1 <- ggplot(wait.full.plot.some, aes(x = date, y = prop.w4hrs, color = loc)) +
  geom_line(data = wait.full.plot.some %>% filter(source == "Actual"), size = 0.3) +  # Actual data lines
  geom_line(data = wait.full.plot.some %>% filter(source == "Forecast"), size = 0.3, linetype = "dashed") +  # Forecast data lines
  geom_ribbon(data = wait.full.plot.some %>% filter(source == "Forecast"), 
              aes(x = date, ymin = lower_ci, ymax = upper_ci, fill = loc), alpha = 0.2, inherit.aes = FALSE, show.legend = FALSE) +
  labs(
    title = "Forecast of the Percentage of Attendances within 4 Hours in Selected Hospital Locations",
    x = "Date",
    y = "Percentage of Attendances within 4 Hours"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, margin = margin(t = 20)),
    axis.text = element_text(size = 8),
    strip.text = element_text(size = 8),
    axis.title.x = element_text(margin = margin(t = 15)),
    legend.title = element_blank(),
    legend.position = "none"
  ) +
  scale_x_date(date_labels = "%Y-%m", date_breaks = "6 months") +  
  facet_wrap(~ loc, scales = "free_y", ncol = 2) +  
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) 

# Print the plot
print(pew_1)
## Warning: Removed 96 rows containing missing values or values outside the scale range
## (`geom_line()`).

ggsave("wait_forecast_someloc.png", plot = pew_1, width = 12, height = 8, dpi = 300)
## Warning: Removed 96 rows containing missing values or values outside the scale range
## (`geom_line()`).